import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
from IPython.display import Image
%matplotlib inline
pwd
# load csv files as dataframes
df_writer = pd.read_csv("original_data\\Movie_Writer.csv")
df_actor = pd.read_csv("original_data\\Movie_Actors.csv")
df_rating = pd.read_csv("original_data\\Movie_AdditionalRating.csv")
df_movie = pd.read_csv("original_data\\Movie_Movies.csv")
df_genre = pd.read_csv("original_data\\Movie_Genres.csv")
First I do a very simple exploratory analysis to understand how all the data is structued and related and I also clean the data in order to put the data into SQL
for i in [df_writer, df_actor, df_rating, df_movie, df_genre]:
print(i.shape)
df_writer.info()
df_writer.head()
df_writer['Responsibility'].value_counts()
df_writer[df_writer['imdbID'].duplicated()]
df_actor.info()
df_actor.head()
df_actor[df_actor['imdbID'].duplicated()]
df_rating.info()
df_rating.head()
df_rating[df_rating['imdbID'].duplicated()]
df_rating[df_rating['imdbID'].duplicated()]
df_movie.info()
df_movie.head(3)
df_movie.tail(3)
df_movie['Type'].value_counts()
df_movie['DVD'].value_counts()
df_movie[df_movie['imdbID'].duplicated()]
df_genre.head()
df_genre.info()
df_genre['Genre'].value_counts()
df_genre[df_genre['imdbID'].duplicated()]
"Unnamed: 0" Column from df_writer, df_actor, df_rating, df_genre¶df_writer.drop("Unnamed: 0", axis = 1, inplace = True)
df_actor.drop("Unnamed: 0", axis = 1, inplace = True)
df_rating.drop("Unnamed: 0", axis = 1, inplace = True)
df_genre.drop("Unnamed: 0", axis = 1, inplace = True)
df_writer.to_csv("cleaned_data\\movie_writer_2.csv", index=False)
df_actor.to_csv("cleaned_data\\movie_actor_2.csv", index=False)
df_rating.to_csv("cleaned_data\\movie_rating_2.csv", index=False)
df_genre.to_csv("cleaned_data\\movie_genre_2.csv", index=False)
All dataframes have imdbID. So it can be used as primary key
df_movie. In order for imdbID to be Primary Key, os there shouldn't be any duplicates in rows and NaN or NA in imdbID column.¶Does imdbID have duplicates or any Na/NaN rows?
df_movie[df_movie.duplicated()]
No duplicates.
df_movie[df_movie['imdbID'].isnull()]
One dummy row.
df_movie = df_movie[df_movie['imdbID'].notnull()]
df_movie.to_csv("cleaned_data\\movie_movies_2.csv", index=False)
There are five data frames in total. The column every data frame has is imdbID. The main table for this data should be Movie_Movies.csv because it has one-to-one match between Title and imdbID. I am going to put the data into SQL now.
Image(filename='movie_record.png')
Image(filename='tables.png')
Image(filename='diagram.png')
conn = pyodbc.connect('Driver={SQL Server};'
'Server=DESKTOP-MB0G47L;'
'Database=test;'
'Trusted_Connection=yes;')
query_first = "SELECT Director, Title, imdbID FROM dbo.movie_movie_2 WHERE Type = 'Movie'" # only moive not series
df_first = pd.read_sql(query_first, conn)
Director and Title columns?¶df_first[df_first['Title'].isnull()]
df_first[df_first['Director'].isnull()]
None rows¶df_first = df_first[df_first['Director'].notnull()]
Image(filename='SQL1.png')
We can see some rows with more than one names.
, separated directors¶df_first['Director'] = df_first['Director'].str.split(",") # make it as a list - comma separated
df_first = df_first.explode('Director') # explode the lists
df_first[df_first['Title'].duplicated()]
(co-director) from elements of the Director column¶df_first['Director'] = df_first['Director'].str.replace(r"\(.*\)","")
df_first['Director'].value_counts().head(10)
df_first['Director'].str.lower().value_counts().head(10)
df_first['Director'].str.strip().value_counts().head(10)
df_first['Director'] = df_first['Director'].str.strip()
Director and Title columns?¶df_first[df_first.duplicated()]
df_first.drop_duplicates(keep='first', inplace =True)
names = list(df_first.groupby('Director')['Title'].count().sort_values(ascending = False).head(10).index)
movies = list(df_first.groupby('Director')['Title'].count().sort_values(ascending = False).head(10).values)
plt.figure(figsize=(14,6));
df_first.groupby('Director')['Title'].count().sort_values(ascending = False).head(10)[::-1].plot.barh(color=(0.9, 0.9, 0.9, 0.9), edgecolor='blue');
plt.title("Top10 directors by moives produced", fontsize=20);
plt.xlabel('Number of Movies', fontsize=14);
plt.ylabel('Directors', fontsize=14);
for count, i in enumerate(movies[::-1]):
plt.text(i, count, str(i), ha='left', va='center');
plt.grid(True);
print(df_first.groupby('Director')['Title'].count().sort_values(ascending = False).head(10))
The director who produced the most movies is Jim Powers with 157 moives. He is incomparable to any one on the Top 10 list. The second and third ranked directors are Dave Fleischer and D.W. Griffth with 109 and 104, respectively. The numbers of the 2nd and 3rd show a huge difference to the Top 1 director Jim Powers. So when it comes to making movies, Jim Powers has an extraordinary achievement. In addition, the directors from Top4 to Top10 have produced quite a similar number of moives, ranging from 83 to 95, showing a meaning gap between the Top3 group and the rest of Top 10 directors.
query_second = "SELECT dbo.movie_movie_2.imdbID, Director, Genre \
FROM dbo.movie_movie_2 \
JOIN dbo.movie_genre_2 \
ON movie_movie_2.imdbID = dbo.movie_genre_2.imdbID \
WHERE Director IS NOT NULL"
df_second = pd.read_sql(query_second, conn)
df_second.info()
df_second.head()
Director column¶def name_cleaner(df):
df['Director'] = df['Director'].str.split(",") # make it as a list - comma separated
df = df.explode('Director') # explode the lists
df['Director'] = df['Director'].str.replace(r"\(.*\)","")
df['Director'] = df['Director'].str.strip()
df.drop_duplicates(keep='first', inplace =True)
return df
df_second_new = name_cleaner(df_second)
df_second_new = df_second_new.pivot_table(index='Director', columns='Genre', values='imdbID', aggfunc= lambda x: len(x))
df_second_new.reset_index(inplace=True)
df_second_new.head(3)
df_top_ten = df_first.groupby('Director', as_index=False)['Title'].count().sort_values(ascending=False, by='Title').head(10)
df_top_ten.reset_index(drop=True, inplace=True)
df_top_ten.columns = ['Director', 'Counts']
df_top_ten and df_second_new - Inner Join¶df_top_ten
df_merged = df_top_ten.merge(df_second_new, left_on='Director', right_on = 'Director')
df_merged.head()
fig, ([ax1, ax2], [ax3, ax4]) = plt.subplots(2, 2, figsize=(16,16))
ax1.pie(df_merged.iloc[0][2:][df_merged.iloc[0][2:].notnull()], labels=df_merged.iloc[0][2:][df_merged.iloc[0][2:].notnull()].index)
ax1.set_title("Jim Powers Top1")
ax2.pie(df_merged.iloc[1][2:][df_merged.iloc[1][2:].notnull()], labels=df_merged.iloc[1][2:][df_merged.iloc[1][2:].notnull()].index)
ax2.set_title("Dave Fleischer Top2")
ax3.pie(df_merged.iloc[2][2:][df_merged.iloc[2][2:].notnull()], labels=df_merged.iloc[2][2:][df_merged.iloc[2][2:].notnull()].index)
ax3.set_title("D.W. Griffith Top3")
ax4.pie(df_merged.iloc[3][2:][df_merged.iloc[3][2:].notnull()], labels=df_merged.iloc[3][2:][df_merged.iloc[3][2:].notnull()].index)
ax4.set_title("Lewin Fitzhamon Top4")
plt.show()
I will take a look at only first four directos from the top. Interestingly, it seems that the Top1 director Jim Powers is an adult film director. Most of his films are in the adult gerne. In contrast, as for the Top 2 director, he's a film director for children as his films are mostly related to Animation and Comedy. For the rest of the directors (D.W. Griffith and Lewin Fitzhamon), it is not so obvious what their films are about. I assume that their work is relatively diverse than first two directos.
query_third = "SELECT DISTINCT movie_movie_2.imdbID, [YEAR], Genre \
FROM dbo.movie_movie_2 \
FULL JOIN dbo.movie_genre_2 \
ON dbo.movie_movie_2.imdbID = movie_genre_2.imdbID \
WHERE Genre IS NOT NULL"
df_third = pd.read_sql(query_third, conn)
df_third.info()
df_third.head(3)
df_third['Genre'].value_counts()
df_third['YEAR'].value_counts().tail(50)
df_third = df_third[(df_third['YEAR'] != '2021') & (df_third['YEAR'] != '2021.0') & (df_third['YEAR'] != '2022') & (df_third['YEAR'] != '2010–2013') & (df_third['YEAR'] != '2007–2011') & (df_third['YEAR'] != '2003–2005') & (df_third['YEAR'] != '2012–2016') & (df_third['YEAR'] != '2023')]
YEAR into int¶df_third['YEAR'] = df_third['YEAR'].astype(float).astype(int)
df_third = df_third[df_third['YEAR'] <= 2017]
plt.figure(figsize=(17,8));
df_third.groupby(['YEAR'])['imdbID'].count().plot.bar();
plt.ylabel('Counts', fontsize=14);
plt.xlabel('Year', fontsize=14);
plt.title('Movie by year', fontsize=18);
df_third_new = df_third.pivot_table(index='YEAR', columns='Genre', values='imdbID', aggfunc= lambda x: len(x))
df_third_new.head(3)
df_third_new.describe()
import dash
import plotly.graph_objects as go
import plotly.offline as py
data_bar_list = list(df_third_new)
data_bar = [go.Bar(
x=df_third_new.index,
y=df_third_new[genre].values,
name=genre,
) for genre in data_bar_list]
layout_bar = go.Layout(
title='Movie Genres by Year',
barmode='stack',
yaxis_title='Number of movies',
xaxis_title='Year',
legend=dict(
x=-0.1,
y=-0.1,
orientation="h")
)
fig_bar = go.Figure(data=data_bar, layout=layout_bar)
fig_bar.update_layout(
autosize=False,
width=1000,
height=700
)
fig_bar.write_html("file.html")
The number of films produced per year has been increasing since 1920. One interesting fact from the plot above is that, for some reason, there was a short film boom between 1905 and 1919. As can be seen, there was a peak around 1910, then it drastically decreased. It seems that there might be an negative event that affected the film industry. So the early development of the film industry was ended due to the event. Since then the movie industry had a long plateau until 1980s. However, all of a sudden in 1990s the industry started booming and the number of films has been exponentially increasing until 2015. The most noticable genres in trend are drama, comedy and documentary (except for short), while one of the decreasing gernes is adult.